
## Margherita Belgioioso, Christoph Dworschak, Kristian Gleditsch  ##
## Deprivation and hate crime                                      ##
## 2021 - 2023                                                     ##
## R version 4.1.1 - 4.3.1                                         ##

packages <- list("rstudioapi", "stargazer", "effects", "sf", "sp", "spdep", "randomForest", "modelsummary", "MASS", "lmtest", "sandwich")
invisible(lapply(packages, library, character.only = TRUE))

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
rm(list=ls())

load(file = "Deprivation_2023_BDG_Replication.RData")
source("helpers.R")

table(unlist(lapply(lapply(lapply(df3.shapes$geometry, st_bbox), is.infinite), any)))

main.covariates <- c("ln_pop", "age_mean", "uneployment_longterm_perct", "reli_christian", "urban", "ethn_arab_perct", "ethn_bame_perct", "police.force.name")
main.predictors <- c("Income.Score", "Living.Environment.Score", "Education.Score")
main.predictors.19 <- paste0(main.predictors, ".19"); main.predictors.19 <- c(main.predictors.19, paste0(main.predictors.19, ".splag1.mean"))
main.predictors <- c(main.predictors, paste0(main.predictors, ".splag1.mean"))
all.relevant.vars <- c(main.covariates, main.predictors)

df5 <- df4
colnames(df5) <- gsub("hcrimes\\.train", "hcrimes\\.cleaned", gsub("\\.15", "", colnames(df5)))
df5 <- df5[complete.cases(df5[,c(all.relevant.vars, "hcrimes.cleaned.p100k.ln")]),]
df5$na.19 <- as.numeric(!complete.cases(df5[,c(all.relevant.vars, "hcrimes.test2.p100k.ln")]))

df5$hcrimes.cleaned.p100k.ln.zero.omit <- ifelse(df5$hcrimes.cleaned.p100k.ln==0, NA, df5$hcrimes.cleaned.p100k.ln)
colnames(df3.shapes) <- gsub("\\.15", "", colnames(df3.shapes))

#### Descriptives ####

# print( xtable::xtable( cbind( t( apply(df5[,-1], 2, summary) ), SD = apply(df5[,-1], 2, sd)) ), type = "html")
cor(df5[,c("Income.Score", "Living.Environment.Score", "Education.Score")])


### Hate crimes:
n.color.bins <- 8
df3.shapes$color.bins <- as.factor( as.numeric( cut(df3.shapes$hcrimes.p100k.ln, n.color.bins)  ))
df3.shapes$hcrime.bins <- cut(df3.shapes$hcrimes.p100k.ln, n.color.bins)
level.exp.legend <- paste0("(", c(0, round(exp(as.numeric(gsub("^(.)(.{1,})(,.{1,}]$)", "\\2", levels(df3.shapes$hcrime.bins), perl = TRUE))))[-1]), ", ",
                           round(exp(as.numeric(gsub("^(.{1,},)(.{1,})(])$", "\\2", levels(df3.shapes$hcrime.bins), perl = TRUE)))), "]")

# jpeg("plots/hcrimes_map.jpeg", width = 500, height = 700, quality = 100)
plot(df3.shapes$geometry, 
     col = heat.colors(n.color.bins, rev = TRUE)[df3.shapes$color.bins],
     main = "", axes = TRUE, graticule = TRUE, border = NA)
par(xpd=TRUE)
legend("topleft", cex = 1.3,
       title = "\nNumber of \n hate crimes per 100k",
       fill = heat.colors(n.color.bins, rev = TRUE),
       legend = level.exp.legend,
       bty = "n")
par(xpd=FALSE)
dev.off()
# jpeg("plots/hcrimes_hist.jpeg", width = 700, height = 700, quality = 100)
hist(df3.shapes$hcrimes.p100k.ln, breaks = 94, xlim = c(0,10), ylim = c(0,2500), yaxt = "n", xaxt = "n", main = "",
     col = rep(heat.colors(n.color.bins, rev = TRUE), each = round(94/n.color.bins)), 
     xlab = "Number of hate crimes per 100,000, logged", cex.lab = 1.6)
axis(1, at=seq(0,10,by=2), labels=seq(0,10,by=2), cex.axis = 1.6) 
axis(2, at=seq(0,2500,by=500), labels=seq(0,2500,by=500), cex.axis = 1.6) 
dev.off()

# map.plot.fun(area.name = "Colchester")
# dev.off()


### Deprivation:
# jpeg("plots/depri_inc_map.jpeg", width = 500, height = 700, quality = 100)
n.color.bins <- 9
df3.shapes$color.bins <- as.factor( as.numeric( cut(df3.shapes$Income.Score, n.color.bins)  ))
plot(df3.shapes$geometry, 
     col = heat.colors(n.color.bins, rev = TRUE)[df3.shapes$color.bins],
     main = "", axes = TRUE, graticule = TRUE, border = NA)
# legend("topleft", cex = 0.8,
#        title = "Level of income deprivation",
#        fill = c(heat.colors(n.color.bins, rev = TRUE), "white"),
#        legend = c(levels(df3.shapes$color.bins), "Not observed"),
#        bty = "n")
dev.off()
# jpeg("plots/depri_inc_hist.jpeg", width = 600, height = 500, quality = 100)
hist(df5$Income.Score, breaks = 70, xlim = c(0,0.7), ylim = c(0,2500), yaxt = "n", xaxt = "n", main = "",
     col = rep(heat.colors(n.color.bins, rev = TRUE), each = round(70/n.color.bins)), 
     xlab = "Income deprivation score", cex.lab = 1.6)
axis(1, at=seq(0,0.7,by=0.1), labels=seq(0,0.7,by=0.1), cex.axis = 1.6) 
axis(2, at=seq(0,2500,by=500), labels=seq(0,2500,by=500), cex.axis = 1.6) 
dev.off()

hist(df5$Living.Environment.Score, breaks = 65)
hist(df5$Education.Score, breaks = 65)


# jpeg("plots/depri_envir_map.jpeg", width = 500, height = 700, quality = 100)
n.color.bins <- 9
cut.bins <- as.factor( as.numeric( cut(df3.shapes$Living.Environment.Score, n.color.bins)))
plot(df3.shapes$geometry, 
     col = heat.colors(n.color.bins, rev = TRUE)[df3.shapes$color.bins],
     main = "", axes = TRUE, graticule = TRUE, border = NA)
# legend("left", cex = 0.8,
#        fill = c(heat.colors(n.color.bins, rev = TRUE), "white"),
#        legend = c(levels(cut.bins), "Not observed"))
dev.off()
# jpeg("plots/depri_envir_hist.jpeg", width = 600, height = 500, quality = 100)
hist(df5$Living.Environment.Score, breaks = 55, xlim = c(0,100), ylim = c(0,2500), yaxt = "n", xaxt = "n", main = "",
     col = rep(heat.colors(n.color.bins, rev = TRUE), each = round(55/n.color.bins)), 
     xlab = "Environment deprivation score", cex.lab = 1.6)
axis(1, at=seq(0,100,by=20), labels=seq(0,100,by=20), cex.axis = 1.6) 
axis(2, at=seq(0,2500,by=500), labels=seq(0,2500,by=500), cex.axis = 1.6) 
dev.off()

# jpeg("plots/depri_educ_map.jpeg", width = 500, height = 700, quality = 100)
n.color.bins <- 9
cut.bins <-  as.factor( as.numeric( cut(df3.shapes$Education.Score, n.color.bins)))
plot(df3.shapes$geometry, 
     col = heat.colors(n.color.bins, rev = TRUE)[df3.shapes$color.bins],
     main = "", axes = TRUE, graticule = TRUE, border = NA)
# legend("left", cex = 0.8,
#        fill = c(heat.colors(n.color.bins, rev = TRUE), "white"),
#        legend = c(levels(cut.bins), "Not observed"))
dev.off()
# jpeg("plots/depri_educ_hist.jpeg", width = 600, height = 500, quality = 100)
hist(df5$Education.Score, breaks = 55, xlim = c(0,100), ylim = c(0,2500), yaxt = "n", xaxt = "n", main = "",
     col = rep(heat.colors(n.color.bins, rev = TRUE), each = round(55/n.color.bins)), 
     xlab = "Education deprivation score", cex.lab = 1.6)
axis(1, at=seq(0,100,by=20), labels=seq(0,100,by=20), cex.axis = 1.6) 
axis(2, at=seq(0,2500,by=500), labels=seq(0,2500,by=500), cex.axis = 1.6) 
dev.off()


#### Analysis ####
f.base <- formula(hcrimes.cleaned.p100k.ln ~ ln_pop + age_mean + uneployment_longterm_perct + reli_christian + urban + ethn_arab_perct + ethn_bame_perct + police.force.name)
summary( lm.zbase.1 <- lm(f.base, df5) )

### Main regressions:
summary( lm.income.1 <- lm(update(f.base, ~ Income.Score), df5) )
summary( lm.income.2 <- lm(update(f.base, ~ Income.Score + Income.Score.splag1.mean + .), df5) )
summary( lm.envir.1 <- lm(update(f.base, ~ Living.Environment.Score), df5) )
summary( lm.envir.2 <- lm(update(f.base, ~ Living.Environment.Score + Living.Environment.Score.splag1.mean + .), df5) )
summary( lm.educ.1 <- lm(update(f.base, ~ Education.Score), df5) )
summary( lm.educ.2 <- lm(update(f.base, ~ Education.Score + Education.Score.splag1.mean + .), df5) )

### Plotting predicted values:
# Calculating predicted values on original y-scale:
x.levels <- data.frame(income = seq(min(df5$Income.Score), max(df5$Income.Score), length = 30),
                       envir = seq(min(df5$Living.Environment.Score), max(df5$Living.Environment.Score), length = 30),
                       educ = seq(min(df5$Education.Score), max(df5$Education.Score), length = 30))
set.seed(121); pred.plot.data <- boot.fun(n.sims = 100000, the.models = list(lm.income.2, lm.envir.2, lm.educ.2), x.levels = x.levels)
# save(pred.plot.data, file = "pred.plot.data.RData")
# load("pred.plot.data.RData")

# pdf("plots/main_income.pdf", width = 9, height = 8)
# tiff("plots/main_income.tiff", units="in", width = 9, height = 8, res=200)
# jpeg("plots/main_income.jpeg", width = 600, height = 450)
plot(x.levels[["income"]], pred.plot.data[,"fit.inc"], type = "l", col = "gray40", lty = 2,
     ylim = c(0, max(pred.plot.data[,grepl("upr", colnames(pred.plot.data))])), 
     xlab = "Income deprivation", ylab = "Hate crimes/100,000", bty="n")
lines(x.levels[["income"]], pred.plot.data[,"lwr.inc"], col = "gray40", lty = 3)
lines(x.levels[["income"]], pred.plot.data[,"upr.inc"], col = "gray40", lty = 3)
dev.off()

# pdf("plots/main_envireduc.pdf", width = 9, height = 8)
# tiff("plots/main_envireduc.tiff", units="in", width = 9, height = 8, res=200)
# jpeg("plots/main_envireduc.jpeg", width = 600, height = 450)
plot(x.levels[["envir"]], pred.plot.data[,"fit.env"], type = "l", col = "gray40", lty = 2,
     xlim = c(0,100), ylim = c(0, max(pred.plot.data[,grepl("upr", colnames(pred.plot.data))])), 
     xlab = "Living environment and education deprivation", ylab = "Hate crimes/100,000", bty="n")
lines(x.levels[["envir"]], pred.plot.data[,"lwr.env"], col = "gray40", lty = 3)
lines(x.levels[["envir"]], pred.plot.data[,"upr.env"], col = "gray40", lty = 3)
text(75, 160, "Living environment \ndeprivation", col = "gray30", cex = 0.9)
lines(x.levels[["educ"]], pred.plot.data[,"fit.edu"], col = "gray70", lty = 2)
lines(x.levels[["educ"]], pred.plot.data[,"lwr.edu"], col = "gray70", lty = 3)
lines(x.levels[["educ"]], pred.plot.data[,"upr.edu"], col = "gray70", lty = 3)
text(80, 30, "Education deprivation", col = "gray50", cex = 0.9)
dev.off()

### Main regression table
variable.names <- c("Income deprivation", 
                    "Living envir. deprivation",
                    "Education deprivation",
                    "Income depr. (neighb.)",
                    "Living envir. depr. (neighb.)",
                    "Education depr. (neighb.)",
                    "Population size (log)",
                    "Population age",
                    "Unemployement perct.",
                    "Christian religion perct.",
                    "Urban",
                    "Arab ethnic perct.",
                    "BAME perct.")
names(variable.names) <- all.relevant.vars[c(grep("Income.Score$", all.relevant.vars):length(all.relevant.vars), 
                                             1:grep("ethn_bame", all.relevant.vars))]
modelsummary::modelsummary(list(lm.zbase.1, lm.income.1, lm.envir.1, lm.educ.1, lm.income.2, lm.envir.2, lm.educ.2),
                           stars = FALSE, coef_map = variable.names, gof_map = c("nobs", "r.squared", "rmse"), output = "main1.html")


#### Robustness checks ####

# ZINBs & analyses without counties with zero hate crimes
summary( lm1.omit <- lm(update(f.base, hcrimes.cleaned.p100k.ln.zero.omit ~ Income.Score + Income.Score.splag1.mean + .), df5) )
summary( lm2.omit <- lm(update(f.base, hcrimes.cleaned.p100k.ln.zero.omit ~ Living.Environment.Score + Living.Environment.Score.splag1.mean + .), df5) )
summary( lm3.omit <- lm(update(f.base, hcrimes.cleaned.p100k.ln.zero.omit ~ Education.Score + Education.Score.splag1.mean + .), df5) )
summary( zinb1 <- pscl::zeroinfl(update(f.base, round(hcrimes.cleaned.p100k) ~ Income.Score + Income.Score.splag1.mean + .), data = df5, dist = "negbin"))
summary( zinb2 <- pscl::zeroinfl(update(f.base, round(hcrimes.cleaned.p100k) ~ Living.Environment.Score + Living.Environment.Score.splag1.mean + .), data = df5, dist = "negbin"))
summary( zinb3 <- pscl::zeroinfl(update(f.base, round(hcrimes.cleaned.p100k) ~ Education.Score + Education.Score.splag1.mean + .), data = df5, dist = "negbin"))

modelsummary::modelsummary(list(lm1.omit, lm2.omit, lm3.omit),
                           stars = FALSE, 
                           coef_map = variable.names, 
                           gof_map = c("nobs", "r.squared", "rmse"), output = "rob_omit.html")

variable.names.zinb <- c(paste0(variable.names, "_2nd"), paste0(variable.names, "_1st"))
names(variable.names.zinb) <- c(paste0("count_", names(variable.names)), paste0("zero_", names(variable.names)))
modelsummary::modelsummary(list(zinb1, zinb2, zinb3),
                           stars = FALSE, 
                           coef_map = variable.names.zinb, 
                           gof_map = c("nobs", "r.squared", "rmse"), output = "rob_zinb.html")

# Analyses with spatial interactions and without spatial lags:
summary( lm.income.2a <- lm(update(f.base, ~ Income.Score + Income.Score.splag1.mean + I(Income.Score*Income.Score.splag1.mean) + .), df5) )
summary( lm.income.2b <- lm(update(f.base, ~ Income.Score + .), df5) )
summary( lm.envir.2a <- lm(update(f.base, ~ Living.Environment.Score + Living.Environment.Score.splag1.mean + I(Living.Environment.Score*Living.Environment.Score.splag1.mean) + .), df5) )
summary( lm.envir.2b <- lm(update(f.base, ~ Living.Environment.Score + .), df5) )
summary( lm.educ.2a <- lm(update(f.base, ~ Education.Score + Education.Score.splag1.mean + I(Education.Score*Education.Score.splag1.mean) + .), df5) )
summary( lm.educ.2b <- lm(update(f.base, ~ Education.Score + .), df5) )

modelsummary::modelsummary(list(lm.income.2b, lm.envir.2b, lm.educ.2b, lm.income.2a, lm.envir.2a, lm.educ.2a),
                           stars = FALSE, 
                           coef_map = c(variable.names[1:6], 
                                        "I(Income.Score * Income.Score.splag1.mean)" = "Income depr. * neighb.", 
                                        "I(Living.Environment.Score * Living.Environment.Score.splag1.mean)" = "Living envir. depr. * neighb.",
                                        "I(Education.Score * Education.Score.splag1.mean)" = "Education depr. * neighb.",
                                        variable.names[7:length(variable.names)]), 
                           gof_map = c("nobs", "r.squared", "rmse"), output = "rob1.html")


# Calculating predicted values on original y-scale:
x.levels.int <- data.frame(income = rep(seq(min(df5$Income.Score), max(df5$Income.Score), length = 30), 2),
                           income.neighb = rep(quantile(df5$Income.Score.splag1.mean, prob = c(0.05,0.5)), each = 30),
                           envir = rep(seq(min(df5$Living.Environment.Score), max(df5$Living.Environment.Score), length = 30), 2),
                           envir.neighb = rep(quantile(df5$Living.Environment.Score.splag1.mean, prob = c(0.05,0.5)), each = 30),
                           educ = rep(seq(min(df5$Education.Score), max(df5$Education.Score), length = 30), 2),
                           educ.neighb = rep(quantile(df5$Education.Score.splag1.mean, prob = c(0.05,0.5)), each = 30))
pred.plot.data.int <- boot.fun(n.sims = 100000, the.models = list(lm.income.2a, lm.envir.2a, lm.educ.2a), x.levels = x.levels.int, int = TRUE)
# save(pred.plot.data.int, file = "pred.plot.data.int.RData")
# load("pred.plot.data.int.RData")
pred.plot.data.int.1 <- pred.plot.data.int[pred.plot.data.int$income.neighb==unique(pred.plot.data.int$income.neighb)[1],]
pred.plot.data.int.2 <- pred.plot.data.int[pred.plot.data.int$income.neighb==unique(pred.plot.data.int$income.neighb)[2],]

# jpeg("plots/rob_int_income.jpeg", width = 600, height = 450)
plot(pred.plot.data.int.1[,"income"], 
     pred.plot.data.int.1[,"fit.inc"], 
                    type = "l", col = "gray40", lty = 2,
     ylim = c(0, max(pred.plot.data.int[,grepl("upr", colnames(pred.plot.data.int))])),
     xlab = "Income deprivation", ylab = "Hate crimes/100,000", bty="n")
lines(pred.plot.data.int.1[,"income"], pred.plot.data.int.1[,"lwr.inc"], col = "gray40", lty = 3)
lines(pred.plot.data.int.1[,"income"], pred.plot.data.int.1[,"upr.inc"], col = "gray40", lty = 3)
text(0.48, 650, "Low neighboring \ndeprivation", cex = 0.9, col = "gray30")
lines(pred.plot.data.int.2[,"income"], pred.plot.data.int.2[,"fit.inc"], col = "gray60", lty = 2)
lines(pred.plot.data.int.2[,"income"], pred.plot.data.int.2[,"lwr.inc"], col = "gray60", lty = 3)
lines(pred.plot.data.int.2[,"income"], pred.plot.data.int.2[,"upr.inc"], col = "gray60", lty = 3)
text(0.5, 40, "High neighboring deprivation", cex = 0.9, col = "gray40")
dev.off()

# jpeg("plots/rob_int_envir.jpeg", width = 600, height = 450)
plot(pred.plot.data.int.1[,"envir"], 
     pred.plot.data.int.1[,"fit.env"], 
     type = "l", col = "gray40", lty = 2,
     ylim = c(0, max(pred.plot.data.int[,grepl("upr", colnames(pred.plot.data.int))])), xlim = c(0,100),
     xlab = "Living environment deprivation", ylab = "Hate crimes/100,000", bty="n")
lines(pred.plot.data.int.1[,"envir"], pred.plot.data.int.1[,"lwr.env"], col = "gray40", lty = 3)
lines(pred.plot.data.int.1[,"envir"], pred.plot.data.int.1[,"upr.env"], col = "gray40", lty = 3)
text(80, 400, "Low neighboring \ndeprivation", cex = 0.9, col = "gray30")
lines(pred.plot.data.int.2[,"envir"], pred.plot.data.int.2[,"fit.env"], col = "gray60", lty = 2)
lines(pred.plot.data.int.2[,"envir"], pred.plot.data.int.2[,"lwr.env"], col = "gray60", lty = 3)
lines(pred.plot.data.int.2[,"envir"], pred.plot.data.int.2[,"upr.env"], col = "gray60", lty = 3)
text(65, 20, "High neighboring deprivation", cex = 0.9, col = "gray40")
dev.off()

# jpeg("plots/rob_int_educ.jpeg", width = 600, height = 450)
plot(pred.plot.data.int.1[,"educ"], 
     pred.plot.data.int.1[,"fit.edu"], 
     type = "l", col = "gray40", lty = 2,
     ylim = c(0, max(pred.plot.data.int[,grepl("upr", colnames(pred.plot.data.int))])), xlim = c(0,100),
     xlab = "Education deprivation", ylab = "Hate crimes/100,000", bty="n")
lines(pred.plot.data.int.1[,"educ"], pred.plot.data.int.1[,"lwr.edu"], col = "gray40", lty = 3)
lines(pred.plot.data.int.1[,"educ"], pred.plot.data.int.1[,"upr.edu"], col = "gray40", lty = 3)
text(80, 150, "Low neighboring deprivation", cex = 0.9, col = "gray30")
lines(pred.plot.data.int.2[,"educ"], pred.plot.data.int.2[,"fit.edu"], col = "gray60", lty = 2)
lines(pred.plot.data.int.2[,"educ"], pred.plot.data.int.2[,"lwr.edu"], col = "gray60", lty = 3)
lines(pred.plot.data.int.2[,"educ"], pred.plot.data.int.2[,"upr.edu"], col = "gray60", lty = 3)
text(65, 10, "High neighboring deprivation", cex = 0.9, col = "gray40")
dev.off()

# 3-D density plots
# jpeg("plots/rob_int_income_wrong_den.jpeg", width = 600, height = 450)
den2w <- MASS::kde2d(df5$Income.Score, df5$Income.Score.splag1.mean)
persp(den2w, box=TRUE, r = 1, theta = 40, phi = 30, 
      xlab = "Income depr.", ylab = "Neighb. income depr.", zlab = "Density")
dev.off()
den2w <- MASS::kde2d(df5$Living.Environment.Score, df5$Living.Environment.Score.splag1.mean)
persp(den2w, box=TRUE, r = 1, theta = 40, phi = 30, 
      xlab = "Envir. depr.", ylab = "Neighb. envir. depr.", zlab = "Density")
den2w <- MASS::kde2d(df5$Education.Score, df5$Education.Score.splag1.mean)
persp(den2w, box=TRUE, r = 1, theta = 40, phi = 30, 
      xlab = "Educ. depr.", ylab = "Neighb. educ. depr.", zlab = "Density")

# R&R: interactions with ethn_bame_perct and ethn_arab_perct:
summary( lm.income.2c <- lm(update(f.base, ~ Income.Score + Income.Score.splag1.mean + I(Income.Score*ethn_bame_perct) + .), df5) )
summary( lm.envir.2c <- lm(update(f.base, ~ Living.Environment.Score + Living.Environment.Score.splag1.mean + I(Living.Environment.Score*ethn_bame_perct) + .), df5) )
summary( lm.educ.2c <- lm(update(f.base, ~ Education.Score + Education.Score.splag1.mean + I(Education.Score*ethn_bame_perct) + .), df5) )
modelsummary::modelsummary(list(lm.income.2c, lm.envir.2c, lm.educ.2c),
                           stars = FALSE, 
                           coef_map = c(variable.names[1:6], 
                                        "I(Income.Score * ethn_bame_perct)" = "Income depr. * BAME", 
                                        "I(Living.Environment.Score * ethn_bame_perct)" = "Living envir. depr. * BAME",
                                        "I(Education.Score * ethn_bame_perct)" = "Education depr. * BAME",
                                        variable.names[7:length(variable.names)]),
                           gof_map = c("nobs", "r.squared", "rmse"), output = "rob2_BAME.html")


# R&R: cluster-robust SEs:
summary( lm.income.2d <- lm(update(f.base, ~ Income.Score + Income.Score.splag1.mean + .), df5) )
lm.income.2d.cl <- coeftest(lm.income.2d, vcov = vcovCL, cluster = ~ police.force.name)
summary( lm.envir.2d <- lm(update(f.base, ~ Living.Environment.Score + Living.Environment.Score.splag1.mean + .), df5) )
lm.envir.2d.cl <- coeftest(lm.envir.2d, vcov = vcovCL, cluster = ~ police.force.name)
summary( lm.educ.2d <- lm(update(f.base, ~ Education.Score + Education.Score.splag1.mean + .), df5) )
lm.educ.2d.cl <- coeftest(lm.educ.2d, vcov = vcovCL, cluster = ~ police.force.name)
modelsummary::modelsummary(list(lm.income.2d.cl, lm.envir.2d.cl, lm.educ.2d.cl),
                           stars = FALSE, 
                           coef_map = variable.names,
                           gof_map = c("nobs", "r.squared", "rmse"), output = "rob3_ClusterSE.html")


# R&R: AIC
lapply(mget(ls(pattern = "lm\\.")), AIC)



#### Prediction ####
set.seed(1201)

features <- names(variable.names)
features.19 <- c(main.predictors.19, names(variable.names)[-c(1:6)])
f.names <- variable.names

### Logged outcome: 
rf1 <- randomForest(x = setNames(df5[,features], f.names), y = df5$hcrimes.cleaned.p100k.ln, ntree = 500, importance = TRUE, nodesize = 10)
# saveRDS(rf1, "randomforest1.rds")
# rf1 <- readRDS("randomforest1.rds")
rf1
mean((df5$hcrimes.cleaned.p100k.ln-rf1$predicted)^2) # verifying value for MSE
1-mean((df5$hcrimes.cleaned.p100k.ln-rf1$predicted)^2) / var(df5$hcrimes.cleaned.p100k.ln) # verifying value for R2
print( rmse.1 <- sqrt( mean((df5$hcrimes.test2.p100k.ln[df5$na.19==0]-rf1$predicted[df5$na.19==0])^2) ) ) # calculating RMSE for test2 (2019-2021)
print( r2.1 <- 1-mean((df5$hcrimes.test2.p100k.ln[df5$na.19==0]-rf1$predicted[df5$na.19==0])^2) / var(df5$hcrimes.test2.p100k.ln[df5$na.19==0]) ) # calculating R2 for test2 (2019-2021)


rf2 <- randomForest(x = setNames(df5[,features[-c(1:6)]], f.names[-c(1:6)]), y = df5$hcrimes.cleaned.p100k.ln, ntree = 500, importance = TRUE, nodesize = 10)
# saveRDS(rf2, "randomforest2.rds")
# rf2 <- readRDS("randomforest2.rds")
rf2
print( rmse.2 <- sqrt( mean((df5$hcrimes.test2.p100k.ln[df5$na.19==0]-rf2$predicted[df5$na.19==0])^2) ) ) # calculating RMSE for test2 (2019-2021)
print( r2.2 <- 1-mean((df5$hcrimes.test2.p100k.ln[df5$na.19==0]-rf2$predicted[df5$na.19==0])^2) / var(df5$hcrimes.test2.p100k.ln[df5$na.19==0]) ) # calculating R2 for test2 (2019-2021)




### Omitting municipalities with zero in outcome (different DGP):
set.seed(1201)
df5.0 <- df5[!is.na(df5$hcrimes.cleaned.p100k.ln.zero.omit),]
rf1.0 <- randomForest(x = setNames(df5.0[,features], f.names), y = df5.0$hcrimes.cleaned.p100k.ln.zero.omit, ntree = 500, importance = TRUE, nodesize = 10)
# saveRDS(rf1.0, "randomforest1.0.rds")
# rf1.0 <- readRDS("randomforest1.0.rds")
rf1.0
print( rmse.1.0 <- sqrt( mean((df5.0$hcrimes.test2.p100k.ln[df5.0$na.19==0]-rf1.0$predicted[df5.0$na.19==0])^2) ) ) # calculating RMSE for test2 (2019-2021)
print( r2.1.0 <- 1-mean((df5.0$hcrimes.test2.p100k.ln[df5.0$na.19==0]-rf1.0$predicted[df5.0$na.19==0])^2) / var(df5.0$hcrimes.test2.p100k.ln[df5.0$na.19==0]) ) # calculating R2 for test2 (2019-2021)

rf2.0 <- randomForest(x = setNames(df5.0[,features[-c(1:6)]], f.names[-c(1:6)]), y = df5.0$hcrimes.cleaned.p100k.ln.zero.omit, ntree = 500, importance = TRUE, nodesize = 10)
# saveRDS(rf2.0, "randomforest2.0.rds")
# rf2.0 <- readRDS("randomforest2.0.rds")
rf2.0
print( rmse.2.0 <- sqrt( mean((df5.0$hcrimes.test2.p100k.ln[df5.0$na.19==0]-rf2.0$predicted[df5.0$na.19==0])^2) ) ) # calculating RMSE for test2 (2019-2021)
print( r2.2.0 <- 1-mean((df5.0$hcrimes.test2.p100k.ln[df5.0$na.19==0]-rf2.0$predicted[df5.0$na.19==0])^2) / var(df5.0$hcrimes.test2.p100k.ln[df5.0$na.19==0]) ) # calculating R2 for test2 (2019-2021)


### Original scale: 
set.seed(1201)
rf1.1 <- randomForest(x = setNames(df5[,features], f.names), y = df5$hcrimes.cleaned.p100k, ntree = 500, importance = TRUE, nodesize = 10)
# saveRDS(rf1.1, "randomforest1_originalscale.rds")
# rf1.1 <- readRDS("randomforest1_originalscale.rds")
rf1.1
print( rmse.1.1 <- sqrt( mean((df5$hcrimes.test2.p100k[df5$na.19==0]-rf1.1$predicted[df5$na.19==0])^2) ) ) # calculating RMSE for test2 (2019-2021)
print( r2.1.1 <- 1-mean((df5$hcrimes.test2.p100k[df5$na.19==0]-rf1.1$predicted[df5$na.19==0])^2) / var(df5$hcrimes.test2.p100k[df5$na.19==0]) ) # calculating R2 for test2 (2019-2021)

rf2.1 <- randomForest(x = setNames(df5[,features[-c(1:6)]], f.names[-c(1:6)]), y = df5$hcrimes.cleaned.p100k, ntree = 500, importance = TRUE, nodesize = 10)
# saveRDS(rf2.1, "randomforest2_originalscale.rds")
# rf2.1 <- readRDS("randomforest2_originalscale.rds")
rf2.1
print( rmse.1.1 <- sqrt( mean((df5$hcrimes.test2.p100k[df5$na.19==0]-rf2.1$predicted[df5$na.19==0])^2) ) ) # calculating RMSE for test2 (2019-2021)
print( r2.1.1 <- 1-mean((df5$hcrimes.test2.p100k[df5$na.19==0]-rf2.1$predicted[df5$na.19==0])^2) / var(df5$hcrimes.test2.p100k[df5$na.19==0]) ) # calculating R2 for test2 (2019-2021)


# tiff("plots/var_imp2.tiff", units="in", width = 9, height = 7, res=200)
# jpeg("plots/var_imp2.jpeg", width = 900, height = 700, quality = 100)
varImpPlot(rf1, type=2, main = "Variable importance plot")
dev.off()


### Maps:
df3.shapes2 <- df3.shapes[row.names(df3.shapes)%in%row.names(df5[df5$na.19==0,]),]

n.color.bins <- 8
df3.shapes2$hcrime.bins <- cut(df3.shapes2$hcrimes.test2.p100k.ln, n.color.bins)
df3.shapes2$color.bins <- as.factor( as.numeric( df3.shapes2$hcrime.bins ))
level.exp.legend <- paste0("(", c(0, round(exp(as.numeric(gsub("^(.)(.{1,})(,.{1,}]$)", "\\2", levels(df3.shapes2$hcrime.bins), perl = TRUE))))[-1]), ", ",
                           round(exp(as.numeric(gsub("^(.{1,},)(.{1,})(])$", "\\2", levels(df3.shapes2$hcrime.bins), perl = TRUE)))), "]")

# jpeg("plots/map.crimes.observed.jpeg", width = 500, height = 700, quality = 100)
plot(df3.shapes2$geometry, 
     col = heat.colors(n.color.bins, rev = TRUE)[df3.shapes2$color.bins],
     main = "", axes = TRUE, graticule = TRUE, border = NA)
par(xpd=TRUE)
legend("topleft", cex = 1.2,
       title = "\nNumber of \n hate crimes per 100k",
       fill = heat.colors(n.color.bins, rev = TRUE),
       legend = level.exp.legend,
       bty = "n")
par(xpd=FALSE)
dev.off()

cutoff.points <- c(0, gsub("^(.{1,})(\\d\\.\\d{1,2})(])$", "\\2", levels(df3.shapes2$hcrime.bins), perl = TRUE))
cut.hcrimes.pred <- cut(rf1$predicted[df5$na.19==0], cutoff.points)
df3.shapes2$color.bins.pred <- as.factor( as.numeric( cut.hcrimes.pred ))

# jpeg("plots/map.crimes.pred.jpeg", width = 500, height = 700, quality = 100)
plot(df3.shapes2$geometry, 
     col = heat.colors(n.color.bins, rev = TRUE)[df3.shapes2$color.bins.pred],
     main = "", axes = TRUE, graticule = TRUE, border = NA)
par(xpd=TRUE)
legend("topleft", cex = 1.2,
       title = "\nNumber of \n hate crimes per 100k",
       fill = heat.colors(n.color.bins, rev = TRUE),
       legend = level.exp.legend,
       bty = "n")
par(xpd=FALSE)
text(-0.05,50.0, paste0("Hold out period RMSE = ", round(rmse.1,2)), cex = 1.3)
text(0.13,49.7, paste0("Hold out period R2 = ", round(r2.1,3)), cex = 1.3)
dev.off()


cut.hcrimes.pred.naive <- cut(rf2$predicted[df5$na.19==0], cutoff.points)
df3.shapes2$color.bins.pred.2 <- as.factor( as.numeric( cut.hcrimes.pred.naive ))

# jpeg("plots/map.crimes.pred.naive.jpeg", width = 500, height = 700, quality = 100)
plot(df3.shapes2$geometry, 
     col = heat.colors(n.color.bins, rev = TRUE)[df3.shapes2$color.bins.pred.2],
     main = "", axes = TRUE, graticule = TRUE, border = NA)
par(xpd=TRUE)
legend("topleft", cex = 1.2,
       title = "\nNumber of \n hate crimes per 100k",
       fill = heat.colors(n.color.bins, rev = TRUE),
       legend = level.exp.legend,
       bty = "n")
par(xpd=FALSE)
text(-0.05,50.0, paste0("Hold out period RMSE = ", round(rmse.2,2)), cex = 1.3)
text(0.13,49.7, paste0("Hold out period R2 = ", round(r2.2,3)), cex = 1.3)
dev.off()

table( as.numeric(as.character(df3.shapes2$color.bins.pred)) == as.numeric(as.character(df3.shapes2$color.bins)) )
table( as.numeric(as.character(df3.shapes2$color.bins.pred)) < as.numeric(as.character(df3.shapes2$color.bins)) )
table( as.numeric(as.character(df3.shapes2$color.bins.pred)) > as.numeric(as.character(df3.shapes2$color.bins)) )



# jpeg("plots/pred.comp.full.jpeg", width = 800, height = 800, quality = 100)
par(mar = c(6.1, 7, 4.1, 4.1))
plot(rf1.1$predicted[df5$na.19==0], df3.shapes2$hcrimes.test2.p100k, 
     ylim = c(0,4000), xlim = c(0,4000), pch = 1, col = rgb(red = 0.2, green = 0.2, blue = 0.2, alpha = 0.4),
     xlab = "Predicted rate of hate crime (full)", ylab = "", cex.lab = 2, cex = 1.5,
     axes = FALSE)
axis(1, seq(0,4000,1000), seq(0,4000,1000), cex.axis = 2)
axis(2, seq(0,4000,1000), seq(0,4000,1000), cex.axis = 2, las = 1)
title(ylab = "Observed rate of hate crime", cex.lab = 2, line = 5.5)
abline(0,1, lwd=2, lty = 3)
abline(lm(df3.shapes2$hcrimes.test2.p100k ~ rf1.1$predicted[df5$na.19==0]), lwd=2)
dev.off()


# jpeg("plots/pred.comp.null.jpeg", width = 800, height = 800, quality = 100)
par(mar = c(6.1, 7, 4.1, 4.1))
plot(rf2.1$predicted[df5$na.19==0], df3.shapes2$hcrimes.test2.p100k,
     ylim = c(0,4000), xlim = c(0,4000), pch = 1, col = rgb(red = 0.2, green = 0.2, blue = 0.2, alpha = 0.4),
     xlab = "Predicted rate of hate crime (null)", ylab = "", cex.lab = 2, cex = 1.5,
     axes = FALSE)
axis(1, seq(0,4000,1000), seq(0,4000,1000), cex.axis = 2)
axis(2, seq(0,4000,1000), seq(0,4000,1000), cex.axis = 2, las = 1)
title(ylab = "Observed rate of hate crime", cex.lab = 2, line = 5.5)
abline(0,1, lwd=2, lty = 3)
abline(lm(df3.shapes2$hcrimes.test2.p100k ~ rf2.1$predicted[df5$na.19==0]), lwd=2)
dev.off()


# jpeg("plots/pred.comp.nf.jpeg", width = 800, height = 800, quality = 100)
par(mar = c(6.1, 5.5, 4.1, 4.1))
plot(rf1$predicted[df5$na.19==0], rf2$predicted[df5$na.19==0],
     ylim = c(0,8), xlim = c(0,8), pch = 1, col = rgb(red = 0.3, green = 0.3, blue = 0.3, alpha = 0.3),
     xlab = "Predicted rate of hate crime (full, logged)", ylab = "", cex.lab = 2, cex = 1.5,
     axes = FALSE)
axis(1, seq(0,8,2), seq(0,8,2), cex.axis = 2)
axis(2, seq(0,8,2), seq(0,8,2), cex.axis = 2, las = 1)
title(ylab = "Predicted rate of hate crime (null, logged)", cex.lab = 2, line = 4)
abline(0,1, lwd=2, lty = 3)
abline(lm(rf2$predicted[df5$na.19==0] ~ rf1$predicted[df5$na.19==0]), lwd=2)
dev.off()
par(mar = c(6.1, 4.1, 4.1, 4.1))


